Analysis of Immunomodulatory Staphylococcus aureus genes in the Human Microbiome Project (2012)

Author

Joe Boktor

Published

June 13, 2023

Background

Staphylococcus aureus has evolved a wide range of immune evasion mechanisms that allow it to colonize and infect the human host Jong et al. 2019. Due to its clinical relevance, S. aureus has been investigated for years and has led to the discovery of many of the effector molecules responsible for immune evasion. An open question is whether or not these immune evasion mechanisms are present in other commensal taxa that are common to the human microbiome, and if so, what is the distribution of these genes across human body sites?

The Human Microbiome Project (HMP) is a large-scale study that aims to characterize the human microbiome in healthy individuals. The HMP has generated a large amount of metagenomic data that may be leveraged to answer our questions regarding the distribution of S. aureus and other relavent taxa across human body sites.

Objectives

  1. Determine the distribution of S. aureus across human body sites.
  2. Determine if other taxa, common to the human microbiome, contain homologs of S. aureus immune modulation genes.
  3. Determine if S. aureus immune modulation genes are enriched in specific body sites.

Analysis

Code
library(tidyverse)
library(easystats)
library(magrittr)
library(forcats)
library(janitor)
library(reticulate)
library(glue)
library(seqinr)
library(future)
library(batchtools)
library(future.batchtools)
library(fs)
library(tictoc)
library(listenv)
library(progress)
library(viridis)
library(strex)
library(plotly)
library(ggsci)
library(data.table)
library(kableExtra)
library(curatedMetagenomicData)
library(DT)
library(mia)
library(scater)
library(vegan)
library(microbiome)
library(phyloseq)
# Plotting packages
library(ggpackets)
library(ggpointdensity)
library(ggside)
library(patchwork)
library(ggridges)
library(scales)
library(ggtree)
library(seriation)
library(aplot)

tmpdir <- "/central/scratch/jbok/tmp"
homedir <- "/central/groups/MazmanianLab/joeB"
wkdir <- glue(
  "{homedir}/",
  "Microbiota-Immunomodulation/Microbiota-Immunomodulation"
)
src_dir <- glue("{wkdir}/notebooks")
source(glue("{src_dir}/R-scripts/helpers_general.R"))
source(glue("{src_dir}/R-scripts/helpers_sequence-screens.R"))
source(glue("{src_dir}/R-scripts/rhmmer-package.R"))
genus_dir <- glue("{wkdir}/data/interim/refseq_genomes/genus_sampling")  
scratch_dir <- "/central/scratch/jbok"
sa_dir <- glue("{wkdir}/data/interim/saureus_analysis")
hhb_dir <- glue("{sa_dir}/hhblits")

Loading in curated Metagenomic Data from the Human Microbiome Project

Code
curatedMetagenomicData("HMP", dryrun = TRUE, rownames = "short")

hmp_metadata <- sampleMetadata %>%
  filter(grepl("HMP", study_name)) %>%
  select(where(~ !all(is.na(.x))))
hmp_metadata %>%
  group_by(study_name, body_site) %>%
  summarize(sample_counts = n())
hmp_2012_metadata <- sampleMetadata %>%
  filter(grepl("HMP_2012", study_name)) %>%
  select(where(~ !all(is.na(.x))))

ExperimentHub::setExperimentHubOption(
  "cache", "/central/groups/MazmanianLab/joeB/.cache/R/ExperimentHub"
)

hmp_2012_relab <-
  hmp_2012_metadata %>%
  returnSamples("relative_abundance", rownames = "short", counts = TRUE)
hmp_2012_relab_ncbi <-
  hmp_2012_metadata %>%
  returnSamples("relative_abundance", rownames = "NCBI", counts = TRUE)
hmp_2012_genefams <-
  hmp_2012_metadata %>%
  returnSamples("gene_families", rownames = "short", counts = TRUE)

saveRDS(
  hmp_2012_metadata,
  glue("{wkdir}/data/interim/curatedMetagenomicData/hmp_2012_metadata.rds")
)
saveRDS(
  hmp_2012_relab,
  glue("{wkdir}/data/interim/curatedMetagenomicData/hmp_2012_relab.rds")
)
saveRDS(
  hmp_2012_relab_ncbi,
  glue("{wkdir}/data/interim/curatedMetagenomicData/hmp_2012_relab_ncbi.rds")
)
saveRDS(
  hmp_2012_genefams,
  glue("{wkdir}/data/interim/curatedMetagenomicData/hmp_2012_gene.families.rds")
)

1. Human Microbiome Project EDA

Code
hmp_2012_metadata <- readRDS(
  glue("{wkdir}/data/interim/curatedMetagenomicData/hmp_2012_metadata.rds")
)
hmp_2012_relab <- readRDS(
  glue("{wkdir}/data/interim/curatedMetagenomicData/hmp_2012_relab.rds")
)
hmp_2012_relab_ncbi <- readRDS(
  glue("{wkdir}/data/interim/curatedMetagenomicData/hmp_2012_relab_ncbi.rds")
)
# split data by phylogeny
altExps(hmp_2012_relab) <- splitByRanks(hmp_2012_relab)
altExps(hmp_2012_relab_ncbi) <- splitByRanks(hmp_2012_relab_ncbi)
# create phyloseq
phy_hmp <- makePhyloseqFromTreeSummarizedExperiment(
  hmp_2012_relab,
  abund_values = "relative_abundance"
)
phy_hmp_ncbi <- makePhyloseqFromTreeSummarizedExperiment(
  hmp_2012_relab_ncbi,
  abund_values = "relative_abundance"
)
taxids_map <- data.frame(
  "sciname" = phy_hmp %>% taxa(),
  "taxid" = phy_hmp_ncbi %>% taxa()
) %>%
  mutate(sciname_formatted = sciname %>%
    str_replace_all("sp. ", "sp_") %>%
    str_replace_all(" ", "_") %>%
    str_replace_all(":", "_") %>%
    str_replace_all("\\[", "") %>%
    str_replace_all("\\]", ""))
Code
hmp_2012_metadata %>%
  group_by(body_site) %>%
  summarize(samples = n()) %>% 
  print_md()
HMP 2012 body site sample counts
body_site samples
nasalcavity 93
oralcavity 414
skin 27
stool 147
vagina 67
Code
hmp_2012_metadata %>%
  group_by(body_site, body_subsite) %>%
  summarize(samples = n()) %>% 
  print_md()
HMP 2012 body sub-site sample counts
body_site body_subsite samples
nasalcavity anterior_nares 93
oralcavity buccal_mucosa 119
oralcavity hard_palate 1
oralcavity keratinized_gingiva 6
oralcavity palatine_tonsils 6
oralcavity saliva 5
oralcavity subgingival_plaque 7
oralcavity supragingival_plaque 127
oralcavity throat 7
oralcavity tongue_dorsum 136
skin l_retroauricular_crease 9
skin r_retroauricular_crease 18
stool stool 147
vagina mid_vagina 2
vagina posterior_fornix 62
vagina vaginal_introitus 3
Code
hmp_2012_metadata %>%
  DT::datatable(options = list(scrollX = TRUE))
Code
alpha_stats <- microbiome::alpha(phy_hmp, index = "observed")
alpha_stats_df <- alpha_stats %>%
  bind_cols(meta(phy_hmp))

p_alpha <- alpha_stats_df %>%
  ggplot(aes(body_site, observed)) +
  geom_point(
    aes(color = body_subsite),
    position = position_jitter()) +
  geom_boxplot(alpha = 0.3, outlier.alpha = 0) +
  theme_bw() +
  scale_color_viridis_d() +
  labs(x = NULL, y = "Observed Species") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplotly(p_alpha)

Figure 1: Alpha diversity of HMP 2012 samples

Code
umap <- hmp_2012_relab %>%
  runUMAP(
    exprs_values = "relative_abundance",
    altexp = "species", name = "UMAP"
    ) %>%
  plotReducedDim("UMAP", colour_by = "body_subsite", shape_by = "body_site") +
    labs(x = "UMAP 1", y = "UMAP 2")

ggplotly(umap)

Figure 2: UMAP of HMP 2012 species relative abundance data. Shape indicates body-site, and colors indicate by body subsite.

Code
library(microshades)
library(speedyseq)

mdf_prep <- prep_mdf(phy_hmp, subgroup_level = "genus")
color_objs_phyhmp <- create_color_dfs(mdf_prep,
  group_level = "phylum",
  subgroup_level = "genus",
  selected_groups = 
    c('Proteobacteria', 'Actinobacteria', 'Bacteroidota', 'Firmicutes'),
  cvd = TRUE
)
# Extract
hmp_ordered <- reorder_samples_by(
  mdf_hmp_unordered <- color_objs_phyhmp$mdf, 
  color_objs_phyhmp$cdf,
  group_level = "phylum",
  subgroup_level = "genus",
  order = "Streptococcus"
)
mdf_hmp <- hmp_ordered$mdf
cdf_hmp <- hmp_ordered$cdf

plot_1 <- plot_microshades(mdf_hmp, cdf_hmp)
final_barplot <- plot_1 +
  facet_grid(~body_site, scales = "free", space = "free") +
  scale_y_continuous(labels = scales::percent, expand = expansion(0)) +
  theme(
    legend.key.size = unit(0.2, "cm"),
    text = element_text(size = 10),
    axis.text.x = element_blank(),
    legend.position = "bottom",
  )

ggsave(
  glue("{wkdir}/figures/hmp_analysis/{Sys.Date()}_phylum-genus-barplot.png"),
  final_barplot,
  width = 15,
  height = 10,
  dpi = 600
)

Figure 3: Relative abundance barplot of top Genera colored as gradients within a Phyum.

2. Body Site Distribution of S. aureus

Code
staph_abundance_df <- phy_hmp %>%
  microbiome::transform("compositional") %>%
  abundances() %>%
  as.data.frame() %>%
  rownames_to_column("taxa") %>%
  filter(grepl("Staphylococcus", taxa)) %>%
  pivot_longer(-taxa, names_to = "sample_id", values_to = "abundance") %>%
  dplyr::left_join(hmp_2012_metadata)

p_staph_abundance <- staph_abundance_df %>%
  ggplot(aes(x = body_site, y = abundance)) +
  geom_point(aes(color = taxa), alpha = 0.6, position = position_jitter()) +
  theme_bw() +
  labs(x = NULL, y = "Relative Abundance")

ggplotly(p_staph_abundance)

Figure 4: Relative HMP 2012 relative abundance data colored by body site

4. Relative abundance of S. aureus UniProt genes in dataset

Table adopted from Jong et al. 2019

Code
staph_proteins <- tribble(
  ~Abbreviation, ~Full_name, ~Evades_process, ~MGEc,
  "Aur", "Aureolysin", "III, V", "",
  "CHIPS", "Chemotaxis inhibitory protein of Staphylococcus", "II", "IEC-1/ɸSa3",
  "Cna", "S. aureus collagen adhesion", "III", "",
  "Eap", "Extracellular adherence protein and extracellular adherence protein homologue", "I, V", "",
  "Ecb", "Extracellular complement-binding protein", "III", "IEC-2",
  "Efb", "Extracellular fibrinogen-binding protein", "III", "IEC-2",
  "FLIPr", "FPR2 inhibitory protein", "II", "IEC-2",
  "Hla", "Hemolysin-alpha", "V", "",
  "Hlg", "Hemolysin-gamma", "V", "",
  "Luk", "Leukocidin", "V", "ɸSa2",
  "Nuc", "Nuclease", "IV", "",
  "PSMs", "Phenol-soluble modulins", "V", "",
  "SAgs", "Superantigens", "V", "IEC1, SaPIn1, vSaβd",
  "SAK", "Staphylokinase", "III, V", "IEC-1/ɸSa3",
  "Sbi", "Staphylococcal binding of IgG", "III", "",
  "SCIN", "Staphylococcal complement inhibitor", "III", "IEC-1/ɸSa3",
  "ScpA", "Staphopain A", "II", "",
  "SdrE", "Surface-associated serine-aspartate repeat protein E", "III", "",
  "SelX", "Staphylococcal enterotoxin-like X", "I", "",
  "SpA", "Staphylococcal protein A", "III", "",
  "SPIN", "Staphylococcal peroxidase inhibitor", "V", "vSaαd",
  "SSL", "Staphylococcal superantigen-like 1-14", "I, II, III", "vSaαd/vSaβd"
)

staph_proteins %>%
  print_md()
Abbreviation Full_name Evades_process MGEc
Aur Aureolysin III, V
CHIPS Chemotaxis inhibitory protein of Staphylococcus II IEC-1/ɸSa3
Cna S. aureus collagen adhesion III
Eap Extracellular adherence protein and extracellular adherence protein homologue I, V
Ecb Extracellular complement-binding protein III IEC-2
Efb Extracellular fibrinogen-binding protein III IEC-2
FLIPr FPR2 inhibitory protein II IEC-2
Hla Hemolysin-alpha V
Hlg Hemolysin-gamma V
Luk Leukocidin V ɸSa2
Nuc Nuclease IV
PSMs Phenol-soluble modulins V
SAgs Superantigens V IEC1, SaPIn1, vSaβd
SAK Staphylokinase III, V IEC-1/ɸSa3
Sbi Staphylococcal binding of IgG III
SCIN Staphylococcal complement inhibitor III IEC-1/ɸSa3
ScpA Staphopain A II
SdrE Surface-associated serine-aspartate repeat protein E III
SelX Staphylococcal enterotoxin-like X I
SpA Staphylococcal protein A III
SPIN Staphylococcal peroxidase inhibitor V vSaαd
SSL Staphylococcal superantigen-like 1-14 I, II, III vSaαd/vSaβd

Gene names were used to query UniProt while applying the following taxonomic filter (S. aureus (S. aureus/Staph. aureus) [1280]), and all results were downloaded as tab-delimited text files.

Code
# read tables in one by one and search for hits in genetables
staph_uniref_paths <- list.files(glue("{wkdir}/data/input/S.aureus_UniRef90"),
  full.names = TRUE
)
staph_uniref_dfs <-
  staph_uniref_paths %>%
  purrr::set_names(~ basename(.)) %>%
  purrr::map(read_tsv) %>%
  bind_rows(.id = "file") %>%
  mutate(Abbvreviation = strex::str_before_first(file, "_")) %>%
  mutate(staph_uniprot_entry_id = glue("UniRef90_{Entry}"))
saveRDS(
  staph_uniref_dfs,
  glue(
    "{sa_dir}/{Sys.Date()}_manual_UniRef_genefamily_search_results.rds"
  )
)

Loading in the HMP 2012 gene families table and extracting UniRef90 IDs.

Code
hmp_2012_genefams <- readRDS(
  glue("{wkdir}/data/interim/curatedMetagenomicData/hmp_2012_gene.families.rds")
)
genefam_rows <- assays(hmp_2012_genefams)$gene_families %>% rownames
uniref90_genefams <- genefam_rows %>% keep(!grepl("\\|", .))
uniref90_genefams %<>% keep(!grepl("UNMAPPED", .))
saveRDS(
  genefam_rows,
  glue(
    "{wkdir}/data/interim/curatedMetagenomicData/",
    "hmp_2012_genefamily_all_uniref90_ids.rds"
  )
)
saveRDS(
  uniref90_genefams,
  glue(
    "{wkdir}/data/interim/curatedMetagenomicData/",
    "hmp_2012_genefamily_uniref90_ids.rds"
  )
)

Utility functions to collect hits from UniRef90 tables and convert sparse matrices to dataframes.

Code
collect_hits_from_df <- function(path) {
  df_uniref <- read_tsv(path)
  uniref_ids <- paste0("UniRef90_", df_uniref$Entry)
  searchres <- genefam_rows %in% uniref_ids
  if (!any(searchres)) {
    return(NULL)
  }
  hits_founds <- assays(hmp_2012_genefams)$gene_families[searchres, ]
  return(hits_founds)
}
dgTMatrix_to_dataframe <- function(mat) {
  cols <- colnames(mat)
  rows <- rownames(mat)
  dfres <- Matrix::summary(mat) %>%
    as.data.frame()
  dfres %>%
    mutate(
      i = purrr::map(i, ~ rows[.x]),
      j = purrr::map(j, ~ cols[.x])
    ) %>%
    return()
}
possibly_dgTMatrix_to_dataframe <-
  possibly(dgTMatrix_to_dataframe, otherwise = NULL)
Code
hits_scraped <- staph_uniref_paths %>%
  purrr::set_names(~ basename(.)) %>%
  purrr::map(~ collect_hits_from_df(.))
all_hits_df <-
  purrr::map_df(hits_scraped, possibly_dgTMatrix_to_dataframe)

saveRDS(
  all_hits_df,
  glue("{wkdir}/data/interim/curatedMetagenomicData/hmp2012_staph_hits.rds")
)
Code
staph_gene_hits <- readRDS(
  glue("{wkdir}/data/interim/curatedMetagenomicData/hmp2012_staph_hits.rds")
)
staph_gene_hits %<>%
  dplyr::rename(
    staph_uniprot_entry_id = i,
    staph_uniprot_entry_abund = x,
    sample_id = j,
  ) %>%
  mutate_if(is.list, as.character)

p_staph_uniref_hits <- staph_gene_hits %>%
  dplyr::left_join(hmp_2012_metadata, by = "sample_id") %>%
  dplyr::left_join(staph_uniref_dfs, by = "staph_uniprot_entry_id") %>%
  ggplot(aes(x = body_site, y = staph_uniprot_entry_abund)) +
  geom_point(
    aes(colour = Abbvreviation),
    position = position_jitter()
  ) +
  scale_y_log10() +
  labs(color = "Gene") +
  scale_color_viridis_d(option = "H") +
  theme_bw() +
    labs(x=NULL, y="UniRef90 Relative Abundance")

ggplotly(p_staph_uniref_hits)

Direct UniRef90 hits of S. aureus genes in the HMP 2012 data set

Code
# ggsave(
#   glue("{wkdir}/figures/s.aureus-blastp/Staph_Uniref90_direct-search.png"),
#   p_staph_uniref_hits,
#   width = 6, height = 6, dpi = 300
# )

5. RefSeq Select Prot BLAST to identify taxa which contain homologs of S. aureus immune evasion genes

Downloading the RefSeq Select Protein BLAST DB

Code
db_dir <- "/central/groups/MazmanianLab/joeB/Downloads/RefDBs/blastdbs/refseq_select_prot/"

for (n in str_pad((0:21), width = 2, pad = "0")) {
  jname <- glue("refseq_select_prot_{n}")
  ftp_base <- "https://ftp.ncbi.nlm.nih.gov/blast/db"
  dlink1 <- glue("{ftp_base}/refseq_select_prot.{n}.tar.gz")
  dlink2 <- glue("{ftp_base}/refseq_select_prot.{n}.tar.gz.md5")
  wget_download_slurm(
    jobname = jname,
    slurm_out = "/central/scratch/jbok/slurmdump/",
    walltime = "30:00",
    download_link = dlink1,
    output_dir = db_dir
  )
  shell_do(glue("wget -P {db_dir} {dlink2}"))
}

shell_do(glue(
  "wget -P {db_dir}",
  " https://ftp.ncbi.nlm.nih.gov/blast/db/refseq_select_prot-prot-metadata.json"
))

# once complete, untar gz files
list.files(db_dir, pattern = ".tar.gz$", full.names = TRUE) %>%
  purrr::map(~ untar(tarfile = .x, exdir = db_dir))

# delete gz files
list.files(db_dir, pattern = ".tar.gz$", full.names = TRUE) %>%
  purrr::map(~ file.remove(.x))

Retrieving FASTA sequences for each accession ID

Code
accessionIDs <- staph_uniref_dfs$Entry %>% unique()
future::plan(multisession, workers = 16)
fastaSequences <- furrr::future_map(accessionIDs, get_fasta_from_UniProt)

# Save FASTA sequences to individual files
for (i in 1:length(accessionIDs)) {
  accession <- accessionIDs[i]
  fasta <- fastaSequences[[i]]
  filename <- glue(
    "{wkdir}/data/interim/saureus_analysis/",
    "staph_virulence_fastas/{accession}.fasta"
  )
  writeLines(fasta, filename)
  cat("FASTA sequence for", accession, "saved as", filename, "\n")
}

Prepping batch input dataframe for slurm parallelization

Code
results_dir <-
  glue(
    "{wkdir}/data/interim/saureus_analysis/",
    "{Sys.Date()}_RefSeqSelectProt-blastp"
  )
dir.create(
  results_dir,
  showWarnings = FALSE,
  recursive = TRUE
)
# create batch_input_df for slurm
fasta_paths <- list.files(
  glue("{wkdir}/data/interim/saureus_analysis/staph_virulence_fastas"),
  full.names = TRUE
)
# list fasta files of interest
batch_input_df <- data.frame(
  input_fasta = fasta_paths
  ) %>%
  mutate(
    output_path =
      strex::str_after_last(input_fasta, "/") %>%
        map_chr(~ glue("{results_dir}/{.}_blastp.out")),
    working_dir = wkdir,
    refdb = glue(
      "{homedir}/Downloads/RefDBs/blastdbs/",
      "refseq_select_prot/refseq_select_prot"
    ),
    output_cols = glue(
      "qaccver saccver staxids",
      " pident length mismatch gapopen qstart",
      " qend sstart send evalue bitscore"
    )
  ) %>%
    filter(!file.exists(output_path))
batch_input_df %>% glimpse

Executing parallelized blastp

Code
# configure registry ----
cluster_run <- glue("{get_time()}_blastp/")
message("\n\nRUNNING:  ", cluster_run, "\n")
breg <- makeRegistry(
  file.dir = glue(
    "{wkdir}/.cluster_runs/",
    cluster_run
  ),
  seed = 42
)
breg$cluster.functions <- batchtools::makeClusterFunctionsSlurm(
  template = glue("{wkdir}/batchtools_templates/batchtools.slurm.tmpl"),
  scheduler.latency = 0.05,
  fs.latency = 65
)
# Submit Jobs ----
jobs <- batchMap(
  fun = run_blastp_custom,
  args = batch_input_df,
  reg = breg
)
submitJobs(jobs,
  resources = list(
    walltime = 3600,
    memory = 16000,
    ncpus = 2,
    max.concurrent.jobs = 9999
  )
)

Reading in and formatting blastp results

Code
# Reading in results
blastp_res_paths <- list.files(
  glue(
    "{wkdir}/data/interim/saureus_analysis/",
    "2023-06-19_RefSeqSelectProt-blastp"
  ),
  full.names = TRUE
)
possibly_read_delim <- possibly(read.delim)
blastp_res <- blastp_res_paths %>%
  purrr::set_names(~ basename(.)) %>%
  purrr::map(~ possibly_read_delim(., header = FALSE) ) %>%
  bind_rows(.id = "file") %>%
  as_tibble()
colnames(blastp_res) <- c(
  "file", "qaccver", "saccver", "staxids",
  "pident", "length", "mismatch", "gapopen", "qstart",
  "qend", "sstart", "send", "evalue", "bitscore"
)
blastp_res %<>%
  mutate(entry_name = strex::str_after_last(qaccver, "\\|"))

# organize Entry IDs by the Staph protein they match
staph_uniref_metadata <- staph_uniref_dfs %>%
  janitor::clean_names() %>%
  select(-c(file, length))

blastp_res_annot <- blastp_res %>%
  filter(pident > 50, evalue < 1e-5 ) %>%
  dplyr::left_join(
    staph_uniref_metadata,
    by = "entry_name", relationship = "many-to-many"
  )

# save a list of unique taxids with a hit for each gene
taxid_hits_by_gene <- unique(blastp_res_annot$abbvreviation) %>%
  purrr::set_names() %>%
  purrr::map(~ filter(blastp_res_annot, abbvreviation == .) %>%
    pull(staxids) %>%
    purrr::map(~ unlist(str_split(., ";"))) %>%
    unlist() %>%
    unique())
saveRDS(
  taxid_hits_by_gene,
  glue("{wkdir}/data/interim/saureus_analysis/taxid_hits_by_gene.rds")
)

In this section we vizualize the blastp results of our UniRef90 gene ID hits. We filter results to maintain only hits with >50% identity and an E-value <1e-5. We then use the NCBI taxonomy database to map the taxids of our hits to their respective species.

We then use the phyloseq package to create a phylogenetic tree of the species which are detected in the Human Microbiome Project 2012 dataset. We then display the center-log ration (CLR) abundances of each taxa by body-site alongside the detection of each gene of interest by taxid. To assist in visualization and remove taxa abundances which are unlikely to be present, we replace CLR values below -1.5 with -10.

Code
taxid_hits_by_gene <- readRDS(
  glue("{wkdir}/data/interim/saureus_analysis/taxid_hits_by_gene.rds")
)

ps_species <- phy_hmp_ncbi %>%
  microbiome::core(detection = 0, prevalence = 1e-10)
ps_species_merged <- ps_species %>%
  merge_samples("body_site", fun = mean) %>%
  microbiome::transform("clr")
  # microbiome::transform(" ")
phyloseq::sample_data(ps_species_merged)$body_site <-
  rownames(phyloseq::sample_data(ps_species_merged))

heatmap_data <- ps_species_merged %>%
  abundances() %>%
  as.data.frame() %>%
  mutate_if(is.numeric, ~case_when(
    . < -1.5 ~ -10,
    TRUE ~ .
  ))

p <- ggtree(phyloseq::phy_tree(ps_species_merged)) +
    geom_tiplab(size=0.2, align=TRUE, linesize=.5)

p_heatmap <- gheatmap(
  p, heatmap_data,
  offset = 0.1, width = 0.6, colnames = FALSE, color = NULL
  ) +
  scale_x_ggtree() +
    scale_fill_viridis_c(option = "F") +
    labs(fill = "CLR \nabundance") +
    theme(
      panel.grid = element_blank(),
      plot.margin = unit(c(1, 1, 1, 1), "cm"),
      axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10)
    )

# Create binary table with presence/absence of each gene per species
species_detection_df <- names(taxid_hits_by_gene) %>%
  purrr::set_names() %>%
  purrr::map_dfr(~ (taxa(ps_species_merged) %in% taxid_hits_by_gene[[.x]])) %>% 
  as.data.frame()

# number of species with genes detected
colSums(species_detection_df)
species_detection_df %<>% mutate(taxa = taxa(ps_species_merged))
rownames(species_detection_df) <- species_detection_df$taxa

saveRDS(
  species_detection_df,
  glue("{wkdir}/data/interim/saureus_analysis/species_detection_df.rds")
  )

detection_heatmap <- species_detection_df %>%
  pivot_longer(!taxa, names_to = "gene", values_to = "detected") %>%
  ggplot(aes(x = gene, y = taxa, fill = detected)) +
  geom_tile() +
  scale_fill_manual(values = c("TRUE" = "#333333", "FALSE" = "#F2F2F2")) +
  labs(x = NULL, y = NULL) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10),
    axis.text.y = element_blank()
  )

p_gene_hit_counts <- species_detection_df %>%
  pivot_longer(!taxa, names_to = "gene", values_to = "detected") %>%
  mutate(detected = as.numeric(detected)) %>%
  group_by(gene) %>%
  summarize(count = sum(detected)) %>%
  ggplot() +
  theme_bw() +
  geom_col(aes(x = gene, y = count), fill = "#333333") +
  scale_y_log10() +
  labs(x = NULL, y = NULL) +
  theme(
    axis.text.x = element_blank(),
    panel.grid = element_blank(), 
    panel.background = element_rect(fill = "#F2F2F2")
  )
detection_heatmap_lab <-
  detection_heatmap %>% insert_top(p_gene_hit_counts, height = 0.1)
finalheatmap <-
  detection_heatmap_lab %>% insert_left(p_heatmap, width = 1.5)

ggsave(
  glue("{wkdir}/figures/s.aureus-blastp/",
  "phylo-heatmap_hmp-species-abund-staph-gene-detection.png"),
  finalheatmap, dpi = 600,
  width = 13, height = 16
  )

Here we provide a rough estimate of taxa origin and attempt to attribute taxa to specific body sites. We calculate the prevalence of each taxa within each body site followed by a calculation the mean prevalence of each taxa across all body sites. Taxa with prevalence values below the mean are zeroed out, and the remaining taxa are classified as present or absent in each body site.

Code
body_site_list <- meta(phy_hmp_ncbi) %>%
  pull(body_site) %>% 
  unique()

prevalence_res <- list()
for (site in body_site_list){
  prevalence_res[[site]] <- phy_hmp_ncbi %>%
    phyloseq::subset_samples(body_site == site) %>%
    microbiome::prevalence()
}
taxa_prev_df <- bind_cols(prevalence_res) %>%
  mutate(taxa = taxa(phy_hmp_ncbi))
prev_max <- taxa_prev_df %>%
  pivot_longer(!taxa) %>%
  group_by(taxa) %>%
  summarize(max_prev = max(value))

taxa_prev_means <- taxa_prev_df %>%
  pivot_longer(!taxa) %>%
  group_by(taxa) %>%
  summarise(mean_prev = mean(value))

taxa_prev_df %<>%
  dplyr::left_join(taxa_prev_means) %>%
    mutate_if(is.numeric, ~ case_when(
    . < mean_prev ~ 0,
    TRUE ~ .
  ))

body_site_attr <- taxa_prev_df %>%
  select(-mean_prev) %>%
  pivot_longer(-taxa) %>%
  filter(value > 0) %>%
  select(-value) %>% 
  nest(.by = taxa)

body_site_attr_df <- body_site_attr %>%
  mutate(
    source = purrr::map(
      data, ~ unlist(.) %>%
        unname(.) %>%
        paste(., collapse = "_&_")
    ),
    source = as.character(source)
  ) %>%
  select(-data)
body_site_attr_df$source %>% unique
saveRDS(
  body_site_attr_df,
  glue(
    "{wkdir}/data/interim/curatedMetagenomicData/",
    "hmp_2012_body_site_attr_df.rds"
  )
)

Visualizing taxa across body sites for each gene of interest.

Code
body_site_attr_df <- readRDS(
  glue(
    "{wkdir}/data/interim/curatedMetagenomicData/",
    "hmp_2012_body_site_attr_df.rds"
  )
)
species_detection_df <- readRDS(
  glue("{wkdir}/data/interim/saureus_analysis/species_detection_df.rds")
  )
p_tissue_gene_heatmap <- species_detection_df %>%
  dplyr::left_join(body_site_attr_df) %>%
  pivot_longer(-c(taxa, source), names_to = "gene", values_to = "detected") %>%
  group_by(gene, source) %>%
  summarize(count = sum(detected)) %>%
  mutate(annot = ifelse(count > 0, count, NA)) %>%
  ggplot(aes(x = source, y = gene, fill = count)) +
  geom_tile() +
  geom_text(aes(label = annot), color = "white") +
  theme_minimal() +
  scale_fill_viridis(option = "H") +
  labs(x = NULL, y= NULL, fill = "Gene hits") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid = element_blank()
  )

ggsave(
  glue("{wkdir}/figures/s.aureus-blastp/body-site-gene-detection-heatmap.png"),
  p_tissue_gene_heatmap,
  width = 6, height = 6
)

Figure 6: Body Site detection of S. aureus immunomodulatory proteins

6. Profile Hidden Markov Model (pHMM) search of UniRef90 genes to identify homologs in the HMP 2012 dataset.

The previous analysis identifies taxa whose representative genome displays gene-homology to some S. aureus genes of interest. However, this analysis does not provide direct detection of the genes of interest in the HMP dataset.

To address this concern, we generated pHMMs of each relavent UniRef90 gene within a category using HHblits against the UniRef30 database. Next, the pHMMs are used to search UniRef90 for homologs. Homologous entries are then searched for in the HMP 2012 dataset. This analysis generates UniRef90 gene families with sequence similarity to S. aureus genes of interest, and further displays relative gene abundance measurements for each gene detected in the HMP, as well as the taxa that they were found in.

flowchart LR
A(S.aureus \nUniRef90 Gene \n Fasta) --> |HHblits \n UniRef30 DB| B(hhblits hits \na3m)
B(hhblits hits \na3m) --> |reformat \n a3m to fasta| C[hhblits hits \nfasta]
C[hhblits hits \nfasta] --> |hhbuild| D[profile Hidden\n Markov Model \npHMM]
D[profile Hidden\n Markov Model \npHMM] --> |hhsearch \n UniRef90| E[hmmsearch hits]
E[hmmsearch hits] --> |Human Microbiome \n Project 2012| F[UniRef90 abundance]  
B(hhblits hits \na3m) --> |hhfilter \n select most diverse seqs| G[hhblits hits \ntrimmed \na3m]
G[hhblits hits \ntrimmed \na3m] --> |reformat \n a3m to fasta| H[hhblits hits \ntrimmed \nfasta]
H[hhblits hits \ntrimmed \nfasta] --> |QC Visualization| I[MSA Report]

hhblits S. aureus fasta files

Code
# list of all s. aureus fasta files
input_fasta_paths <- list.files(
  glue("{sa_dir}/staph_virulence_fastas"),
  full.names = TRUE
)
dir.create(hhb_dir, recursive = TRUE, showWarnings = FALSE)
for (f in input_fasta_paths) {
  fname <- fs::path_ext_remove(basename(f))
  if (file.exists(glue("{hhb_dir}/{fname}.hmm"))) {
    message(glue("{fname} already exists, skipping..."))
    next
  }
  print(fname)
  blits_cmds <- glue(
    "hhblits -cpu 12",
    " -i {f}",
    " -d {homedir}/Downloads/RefDBs/",
    "HHsuite3db/UniRef30_2022_02/UniRef30_2022_02",
    " -o {hhb_dir}/{fname}.hhr",
    " -oa3m {hhb_dir}/{fname}.a3m",
    " -n 2", # number of iterations
    " -diff 1000", # maximum number of sequences to keep in MSA
    " -id 90" # maximum similarity to input sequence
  )
  slurm_shell_do(
    cmd = blits_cmds,
    memory = "10G",
    ncpus = 4,
    walltime = 3600
  )
}

Filtering HHblits hits to remove redundant sequences for visual QC.

Code
# Initiate future.batchtools backend for parallel processing
future::plan(
  future.batchtools::batchtools_slurm,
  template = glue("{wkdir}/batchtools_templates/batchtools.slurm.tmpl"),
  resources = list(
    name = "HHblits-MSA-trim",
    memory = "5G",
    ncpus = 1,
    walltime = 3600
  )
)

slurm_do_msa_trim <- function(
    fname_list,
    out_dir = hhb_dir,
    working_dir = wkdir) {
  require(glue)
  source(glue("{working_dir}/notebooks/R-scripts/helpers_general.R"))
  message(get_time(), " Data dir: ", out_dir)
  for (path in fname_list) {
    fname <- fs::path_ext_remove(path)
    if (file.exists(glue("{out_dir}/{fname}.trimD30.fasta"))) {
      message(glue("{get_time()} {fname} already exists, skipping..."))
      next
    }
    message(glue("{get_time()} {fname} processing..."))
    shell_do(
      glue(
        "hhfilter -i {out_dir}/{fname}.a3m ",
        "-o {out_dir}/{fname}.trimD30.a3m -diff 30 &&",
        " reformat.pl -r {out_dir}/{fname}.trimD30.a3m ",
        "{out_dir}/temp_{fname}.trimD30.fasta &&",
        " mamba run -n pdmbsR seqkit rename ",
        "{out_dir}/temp_{fname}.trimD30.fasta > ",
        "{out_dir}/{fname}.trimD30.fasta"
      )
    )
    shell_do(glue("rm {out_dir}/temp_{fname}.trimD30.fasta"))
  }
}

# list A3M files and trim to the top 30 most diverse sequences 
a3m_paths <- list.files(hhb_dir, pattern = ".a3m") %>%
  keep(!grepl("trimD30", .))
# Chunk files (100 per job) and download
tic()
n_jobs <- ceiling(length(a3m_paths) / 20)
trim_msa_runs <- listenv()
for (job in 1:n_jobs) {
  fnames <- chunk_func(a3m_paths, n_jobs)[[job]]
  trim_msa_runs[[job]] %<-% slurm_do_msa_trim(fname_list = fnames)
}
toc()

Convert full-length A3M files to HMMs for HMMER3 search.

Code
slurm_do_convert_a3m_msa_hmm <- function(
    fname_list,
    out_dir = hhb_dir,
    working_dir = wkdir) {
  require(glue)
  source(glue("{working_dir}/notebooks/R-scripts/helpers_general.R"))
  for (path in fname_list) {
    fname <- fs::path_ext_remove(path)
    if (file.exists(glue("{out_dir}/{fname}.hmm"))) {
      message(glue("{get_time()} {fname} already exists, skipping..."))
      next
    }
    message(glue("{get_time()} {fname} processing..."))
    shell_do(
      glue(
        "reformat.pl -r {out_dir}/{fname}.a3m ",
        "{out_dir}/temp_{fname}.fasta &&",
        " mamba run -n pdmbsR seqkit rename ",
        "{out_dir}/temp_{fname}.fasta > ",
        "{out_dir}/{fname}.fasta &&",
        " hmmbuild {out_dir}/{fname}.hmm {out_dir}/{fname}.fasta"
      )
    )
    shell_do(glue("rm {out_dir}/temp_{fname}.fasta"))
  }
}


# list A3M files and trim to the top 30 most diverse sequences
a3m_paths <- list.files(hhb_dir, pattern = ".a3m") %>%
  keep(!grepl("trimD30", .))

# Chunk files (100 per job) and download
tic()
n_jobs <- ceiling(length(a3m_paths) / 20)
trim_msa_runs <- listenv()
for (job in 1:n_jobs) {
  fnames <- chunk_func(a3m_paths, n_jobs)[[job]]
  trim_msa_runs[[job]] %<-% slurm_do_convert_a3m_msa_hmm(fname_list = fnames)
}
toc()

Execute Hmmsearch on UniRef90 database.

Code
slurm_do_hmmsearch <- function(
    input_paths,
    output_dir,
    db_path,
    working_dir = wkdir) {
  require(glue)
  require(fs)
  source(glue("{working_dir}/notebooks/R-scripts/helpers_general.R"))
  for (hmm_path in input_paths) {
    id <- fs::path_ext_remove(basename(hmm_path))
    if (file.exists(glue("{output_dir}/{id}_hmmsearch.out"))) {
      message(glue("{id} already exists, skipping..."))
      next
    }
    hmmsearch_cmd <- glue(
      "hmmsearch ",
      " --cpu 12",
      " -o {output_dir}/{id}_hmmsearch.out",
      " --tblout {output_dir}/{id}_tblout.txt",
      " --domtblout {output_dir}/{id}_domtblout.txt",
      " --noali",
      " {hmm_path}",
      " {db_path}"
    )
    shell_do(hmmsearch_cmd)
  }
}
# Initiate future.batchtools backend for parallel processing
future::plan(
  future.batchtools::batchtools_slurm,
  template = glue("{wkdir}/batchtools_templates/batchtools.slurm.tmpl"),
  resources = list(
    name = "Hmmer3 search",
    memory = "5G",
    ncpus = 4,
    walltime = 36000
  )
)
# list HMM files
hmm_paths <- list.files(hhb_dir, pattern = ".hmm", full.names = TRUE)
sa_hmmsearch <- glue("{sa_dir}/hmmsearch")
dir.create(sa_hmmsearch, showWarnings = FALSE)
hmm_paths %<>% keep(!file.exists(
  glue("{sa_dir}/hmmsearch/{fs::path_ext_remove(basename(.))}_tblout.txt")
))


# Chunk files (100 per job) and download
tic()
n_jobs <- ceiling(length(hmm_paths) / 1)
hmmsearch_runs <- listenv()
for (job in 1:n_jobs) {
  hmm_paths_chunk <- chunk_func(hmm_paths, n_jobs)[[job]]
  hmmsearch_runs[[job]] %<-% slurm_do_hmmsearch(
    input_paths = hmm_paths_chunk,
    output_dir = sa_hmmsearch,
    db_path = "/central/software/alphafold/data/uniref90/uniref90.fasta"
  )
}
toc()

Formatting hmmsearch output for downstream analysis.

Code
# Searching for Uniref90 hits in HMP genomes
hmmer_results_paths <- list.files(
  glue("{sa_dir}/hmmsearch"),
  full.names = TRUE, pattern = "_tblout.txt"
  ) %>%
  keep(file.size(.) > 0)

# load in hmmersearch results
hmmer_results_list <- hmmer_results_paths %>% #sample(hmmer_results_paths, 50) %>%
  purrr::set_names(gsub("_tblout.txt", "", basename(.))) %>%
  purrr::map(~ read_tblout(.) %>% filter(sequence_evalue < 0.05))
saveRDS(
  hmmer_results_list,
  glue("{sa_dir}/{Sys.Date()}_hmmersearch_results.rds")
)

hmmer_results_list <- readRDS(
  glue("{sa_dir}/2023-06-27_hmmersearch_results.rds")
)

#' utility function to match a list of UniRef90 IDs to the HMP dataset UniRef90 IDs
#' and return the matches
get_hmp_uniref90_matches <- function(input_list) {
  u90_matches <- (uniref90_genefams %in% input_list)
  hits <- uniref90_genefams[u90_matches]
  return(hits)
}

# generate a list of UniRef90 hits that exist in the HMP dataset
staph_gene_uniref90_matches <- hmmer_results_list %>%
  purrr::map( ~ get_hmp_uniref90_matches(.x$domain_name))

staph_gene_uniref90_matches_df <- staph_gene_uniref90_matches %>%
  purrr::map(~ data.frame("UniRef90_Hit_ID" = .)) %>%
  dplyr::bind_rows(.id = "Entry") %>%
  dplyr::left_join(staph_uniref_dfs)

saveRDS(
  staph_gene_uniref90_matches_df,
  glue("{sa_dir}/{Sys.Date()}_HMP-available-homologs-annotated.rds")
)

# extract abundance values for each UniRef90 hit
genefam_hit_rows_lgl <-
  genefam_rows %in% unique(staph_gene_uniref90_matches_df$UniRef90_Hit_ID)
hmp_genefam_hits_df <-
  assays(hmp_2012_genefams)$gene_families[genefam_hit_rows_lgl, ] %>%
  dgTMatrix_to_dataframe()
saveRDS(
  hmp_genefam_hits_df,
  glue("{sa_dir}/{Sys.Date()}_HMP_homologous_genefamilies.rds")
)

Visualzing results

Code
staph_uniref_dfs <- readRDS(
  glue("{sa_dir}/2023-06-28_manual_UniRef_genefamily_search_results.rds")
)
uniref90_genefams <- readRDS(
  glue(
    "{wkdir}/data/interim/curatedMetagenomicData/",
    "hmp_2012_genefamily_uniref90_ids.rds"
  )
)
# table mapping the Original UniRef90 Hits from the manual gene-name search to the gene-family name, and the UniRef90 Homologs found through the pHMM hmmsearch
staph_gene_uniref90_matches_df <- readRDS(
  glue("{sa_dir}/2023-06-27_HMP-available-homologs-annotated.rds")
)
# HMP metadata and gene-family values for the UniRef90 homologs found in the HMP dataset
hmp_genefam_hits_df <- readRDS(
  glue("{sa_dir}/2023-06-27_HMP_homologous_genefamilies.rds")
)

hmp_genefam_hits_df %<>%
  dplyr::rename(
    UniRef90_Hit_ID = i,
    value = x,
    sample_id = j,
  ) %>%
  mutate_if(is.list, as.character)

hmp_genefam_hits_df %<>%
  dplyr::left_join(hmp_2012_metadata, 
    by = "sample_id", relationship = "many-to-many") %>%
  dplyr::left_join(staph_gene_uniref90_matches_df, 
    by = "UniRef90_Hit_ID", relationship = "many-to-many")
hmp_genefam_hits_df %>% glimpse

hmp_genefam_hits_df_stats <- hmp_genefam_hits_df %>%
  group_by(sample_id, body_site, Abbvreviation) %>%
  summarize(
    value = sum(value)
  )

p_ridge <- hmp_genefam_hits_df_stats %>%
  filter(Abbvreviation != "SAK") %>%
  group_by(Abbvreviation) %>% 
  mutate(text = fct_reorder(body_site, value)) %>%
  ggplot(aes(y = body_site, x = value, fill = body_site)) +
  geom_density_ridges(alpha = 0.6, panel_scaling = FALSE, scale = 4) +
  theme_ridges() +
  scale_x_log10() +
  scale_fill_d3() +
  labs(x="Cumulative Gene Family Abundance", y = NULL, fill = "Body Site") +
  facet_grid(rows = vars(Abbvreviation), scales = "free", space = "free") +
  theme(
    legend.position = "top",
    axis.title.x = element_text(hjust = 0.5),
    panel.spacing = unit(0.1, "lines"),
    strip.text.y = element_text(angle = 0),
    strip.text.x = element_text(size = 8)
  )

ggsave(
  file = glue("{wkdir}/figures/s.aureus-blastp/HMP-cumulative-gene-family-abundance.png"),
  p_ridge,
  width = 10, 
  height = 14
)

Figure 7: Cumulative Sum of Gene Family Homologs (UniRef90), direct measurements in samples

For each measured UniRef90 homolog, the HUMANn3 pipeline will provide species-level annotation when possible. We would like to definitively determine which species the homologs of interest are expressed in and which tissues these species, carrying the homologs, are found in.

Here we collect the species-level annotations for each UniRef90 homolog measured in the HMP dataset. For each UniRef90, grep matching UIDs that contain species specific information.

Code
hmp_uid_hits <- hmp_genefam_hits_df$UniRef90_Hit_ID %>% unique
genefam_rows <- readRDS(
  glue(
    "{wkdir}/data/interim/curatedMetagenomicData/",
    "hmp_2012_genefamily_all_uniref90_ids.rds"
  )
)
grepl_matches <- function(query, search_list) {
  matches <- grepl(query, search_list)
  hits <- search_list[matches]
  return(hits)
}
grepl_matches_wrapper <- function(query_list, search_list, outfile) {
  require(purrr)
  hits <- query_list %>%
    purrr::set_names() %>%
    purrr::map(~ grepl_matches(., search_list))
  saveRDS(hits, file = outfile)
}

# Initiate future.batchtools backend for parallel processing
future::plan(
  future.batchtools::batchtools_slurm,
  template = glue("{wkdir}/batchtools_templates/batchtools.slurm.tmpl"),
  resources = list(
    name = glue("{get_time()}_grep-UniRef90-UIDs"),
    memory = "5G",
    ncpus = 1,
    walltime = 7200
  )
)
# 1.5gb  limit (1500*1024^2 = 1572864000)
options(future.globals.maxSize= 1572864000)

# Chunk files (100 per job) and download
n_jobs <- ceiling(length(hmp_uid_hits) / 200)
listenv_res <- listenv()
tic()
for (job in 1:n_jobs) {
  output_path <- glue("{sa_dir}/hmp_homolog_name_grep/{job}.rds")
  if (file.exists(output_path)) {
    next
  }
  message("Executing job ", job)
  chunk_input <- chunk_func(hmp_uid_hits, n_jobs)[[job]]
  listenv_res[[job]] %<-% grepl_matches_wrapper(
    query_list = chunk_input,
    search_list = genefam_rows,
    outfile = output_path
  )
}
toc()

Aggregating and formatting grep results

Code
sa_unirep_name_paths <- list.files(
  glue("{sa_dir}/hmp_homolog_name_grep"),
  pattern = "*.rds",
  full.names = TRUE
)

# read in list of all UniRef90 IDs with taxonomic resolution
sa_unirep_taxa_strat <- 
  purrr::map(sa_unirep_name_paths, readRDS) %>%
  unlist() %>%
  unname() %>%
  unique() %>%
  keep(!grepl("unclassified", .))

# turn into a data.frame with gene ID + taxonomic resolution
sa_unirep_taxa_strat_df <-
  data.frame("full_name" = sa_unirep_taxa_strat) %>%
  separate(full_name, c("UniRef90_Hit_ID", "taxa"), sep = "\\|") %>%
  group_by(UniRef90_Hit_ID) %>%
  mutate(entry_count = n()) %>%
  ungroup()

saveRDS(sa_unirep_taxa_strat_df,
  file = glue("{sa_dir}/hmp_homolog_names_df.rds")
)

Visualizing summary stats for species-level resolution of gene-abundance

Code
sa_unirep_taxa_strat_df <- readRDS(
  file = glue("{sa_dir}/hmp_homolog_names_df.rds")
)
# ECDF Plot 
  sa_unirep_taxa_strat_df %>%
    select(UniRef90_Hit_ID, entry_count) %>%
    distinct() %>%
    ggplot(aes(entry_count-1)) +
    theme_bw() +
    labs(
      x = "Number of taxonomic assignments per gene", 
      y = "ECDF"
      ) +
    stat_ecdf(geom = "point") +
    stat_ecdf(geom = "step")

Figure 8: Emperical Cumulative Distribution Function (ECDF) of the number of taxonomic assignments per gene.

Key Takeaways:

  1. Slightly over 50% of UniRef90 IDs have only no taxonomic assignment
  2. The remaining UniRef90 IDs have only one assignment with a few exceptions.

Next we will visually explore the taxonomic assignments for each gene category of interest, summing across individual homolgs.

Code
detected_taxa <- sa_unirep_taxa_strat_df %>%
  mutate(sciname_formatted = strex::str_after_first(taxa, "s__"))
intersect(
  taxids_map$sciname_formatted,
  unique(detected_taxa$sciname_formatted)
)
# 505 out of 582 detected taxa have a taxid in the phyloseq object
setdiff(
  unique(detected_taxa$sciname_formatted),
  taxids_map$sciname_formatted
)
#' these 72 can't be found in the species abundance table, 
#' maybe the pipeline wasn't able to find their core marker genes?
Code
ps_species <- phy_hmp_ncbi %>%
  microbiome::core(detection = 0, prevalence = 1e-10)
ps_species_merged <- ps_species %>%
  merge_samples("body_site", fun = mean) %>%
  microbiome::transform("clr") 
phyloseq::sample_data(ps_species_merged)$body_site <-
  rownames(phyloseq::sample_data(ps_species_merged))

heatmap_data <- ps_species_merged %>%
  abundances() %>%
  as.data.frame() %>%
  mutate_if(is.numeric, ~case_when(
    . < -1.5 ~ -10,
    TRUE ~ .
  ))

p <- ggtree(phyloseq::phy_tree(ps_species_merged)) +
    geom_tiplab(size=0.2, align=TRUE, linesize=.5)

p_heatmap <- gheatmap(
  p, heatmap_data,
  offset = 0.1, width = 0.6, colnames = FALSE, color = NULL
  ) +
  scale_x_ggtree() +
    scale_fill_viridis_c(option = "F") +
    labs(fill = "CLR \nabundance") +
    theme(
      panel.grid = element_blank(),
      plot.margin = unit(c(1, 1, 1, 1), "cm"),
      axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10)
    )

detected_taxa_with_taxids <- detected_taxa %>%
  dplyr::left_join(taxids_map, by = "sciname_formatted")

taxids_per_genecategory <- detected_taxa_with_taxids %>%
  drop_na(taxid) %>%
  dplyr::left_join(staph_gene_uniref90_matches_df,
    by = "UniRef90_Hit_ID", relationship = "many-to-many"
  ) %>%
  dplyr::select(Abbvreviation, taxid) %>%
    distinct()

species_detection_df <- data.frame("taxid" = taxa(ps_species_merged)) %>%
  dplyr::full_join(taxids_per_genecategory, by = "taxid") %>%
  mutate(x = TRUE) %>%
  pivot_wider(names_from = Abbvreviation, values_from = x, values_fill = FALSE) %>%
  dplyr::rename(taxa = taxid) %>%
  select(-c("NA"))

# # number of species with genes detected
species_detection_df %>%
  select_if(is.logical) %>%
  colSums()

saveRDS(
  species_detection_df,
  glue("{wkdir}/data/interim/saureus_analysis/species_detection_df_homologs_measured.rds")
  )

detection_heatmap <- species_detection_df %>%
  pivot_longer(!taxa, names_to = "gene", values_to = "detected") %>%
  ggplot(aes(x = gene, y = taxa, fill = detected)) +
  geom_tile() +
  scale_fill_manual(values = c("TRUE" = "#333333", "FALSE" = "#F2F2F2")) +
  labs(x = NULL, y = NULL) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10),
    axis.text.y = element_blank()
  )

p_gene_hit_counts <- species_detection_df %>%
  pivot_longer(!taxa, names_to = "gene", values_to = "detected") %>%
  mutate(detected = as.numeric(detected)) %>%
  group_by(gene) %>%
  summarize(count = sum(detected)) %>%
  ggplot() +
  theme_bw() +
  geom_col(aes(x = gene, y = count), fill = "#333333") +
  scale_y_log10() +
  labs(x = NULL, y = NULL) +
  theme(
    axis.text.x = element_blank(),
    panel.grid = element_blank(), 
    panel.background = element_rect(fill = "#F2F2F2")
  )
detection_heatmap_lab <-
  detection_heatmap %>% insert_top(p_gene_hit_counts, height = 0.1)
finalheatmap <-
  detection_heatmap_lab %>% insert_left(p_heatmap, width = 1.5)

ggsave(
  glue("{wkdir}/figures/s.aureus-blastp/",
  "phylo-heatmap_hmp-species-abund-staph-gene-Homologs-measured.png"),
  finalheatmap, dpi = 600,
  width = 13, height = 16
  )

Key Takeaways:

  1. Fifteen out of the 21 S. aureus genes families of interest display homology in at least one other species within the dataset. Six genes are found only in S. aureus.
  2. The following seven genes (Aur, Cna, Nuc, SAgs, ScpA, SdrE, and SpA), display homologs in ~100+ different species. The remaining eight genes (PSMs, ssl, Ecb, Efb, Hla, Sbi, Luk, and SelX) display homologs in less than 10 species.
  3. Few species display homology detection patterns similar to S. aureus, with all or even many of the gene families.

Visualizing direct abundance values of homologs genes, across body sites, for each staph gene of interest.

Code
bodysite_gene_levels <- hmp_genefam_hits_df %>%
  group_by(body_site, Abbvreviation) %>%
  dplyr::summarize(abundance_sum = sum(value))
  
p_direct_gene_abundance_heatmap <- 
  bodysite_gene_levels %>%
    ggplot(aes(x = body_site, 
    y = Abbvreviation, fill = log10(abundance_sum)) #fill = abundance_sum) #
    ) +
  geom_tile() +
  # geom_text(aes(label = annot), color = "white") +
  theme_minimal() +
  scale_fill_viridis(option = "F") +
  labs(x = NULL, y= NULL, fill = expression(log[10] ~ "(summed CPM gene counts)")) +
  theme(
    legend.position = "top",
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid = element_blank()
  )

ggsave(
  glue("{wkdir}/figures/s.aureus-blastp/body-site-gene-homologs-measured-detection-heatmap.png"),
  p_direct_gene_abundance_heatmap,
  width = 4, height = 6
)

Figure 10: Species Body Site Distribution. Species contain measured homologs of S. aureus immunomodulatory proteins

Key Takeaways:

  1. All tissues display some gene-level abundance of homologs. The nasal and oral cavities harbor the highest abundances and prevalence of homologs, followed by the skin, stool, and vagina.
  2. In stool we detect homologs of nine out of the 22 gene-families of interest.

Conclusions

  1. S. aureus is detected in the Human Microbiome Project 2012 dataset in the nasal cavity, oral cavity, and at low abundances in skin.
  2. Using UniProt90 as a reference database, we can detect 11 out of 22 genes of interest in the HMP 2012 dataset. However, these gene IDs are likely specific to S. aureus.
  3. In our search for homologous genes and the taxa which represent them, we find that Nucleases (Nuc) and Staphopain A (ScpA) are the most prevalent in other species. All 22 genes display some level of detection in other species, the majority of which are resident to the nasal cavity, oral cavity, and skin; However, the stool samples dislay enrichment for five different genes, including Staphopain A (ScpA), Surface-associated serine-aspartate repeat protein E (SdrE), Staphylococcal superantigen-like protein (ssl), Nucleases, and S. aureus collagen adhesion (Cna).
  4. Using a more sensitive homolog-centered analysis to obtain UniRef90 gene-families in other species, which strongly resemble S. aureus proteins, we find a bimodal distribution of homologs. The majority of genes display homologs in less than 10 species, while a few genes display homologs in over 100 species.
  5. When directly quantifying homology gene-level abundance across body-sites, we find that the stool does indeed harbor a significant fraction of homologous S. aureus proteins of interest.

Session Info

Code
sessionInfo()
R version 4.2.0 (2022-04-22)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /central/groups/MazmanianLab/joeB/software/mambaforge/envs/pdmbsR/lib/libopenblasp-r0.3.23.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] aplot_0.1.10                   seriation_1.4.2               
 [3] ggtree_3.6.2                   scales_1.2.1                  
 [5] ggridges_0.5.4                 patchwork_1.1.2               
 [7] ggside_0.2.2                   ggpointdensity_0.1.0          
 [9] ggpackets_0.2.1                microbiome_1.20.0             
[11] phyloseq_1.42.0                vegan_2.6-4                   
[13] lattice_0.21-8                 permute_0.9-7                 
[15] scater_1.26.1                  scuttle_1.8.4                 
[17] mia_1.6.0                      MultiAssayExperiment_1.24.0   
[19] DT_0.28                        curatedMetagenomicData_3.6.2  
[21] TreeSummarizedExperiment_2.6.0 Biostrings_2.66.0             
[23] XVector_0.38.0                 SingleCellExperiment_1.20.1   
[25] SummarizedExperiment_1.28.0    Biobase_2.58.0                
[27] GenomicRanges_1.50.2           GenomeInfoDb_1.34.9           
[29] IRanges_2.32.0                 S4Vectors_0.36.2              
[31] BiocGenerics_0.44.0            MatrixGenerics_1.10.0         
[33] matrixStats_1.0.0              kableExtra_1.3.4              
[35] data.table_1.14.8              ggsci_3.0.0                   
[37] plotly_4.10.2                  strex_1.6.0                   
[39] viridis_0.6.3                  viridisLite_0.4.2             
[41] progress_1.2.2                 listenv_0.9.0                 
[43] tictoc_1.2                     fs_1.6.2                      
[45] future.batchtools_0.12.0       parallelly_1.36.0             
[47] batchtools_0.9.17              future_1.32.0                 
[49] seqinr_4.2-30                  glue_1.6.2                    
[51] reticulate_1.30-9000           janitor_2.2.0                 
[53] magrittr_2.0.3                 see_0.8.0                     
[55] report_0.5.7                   parameters_0.21.1             
[57] performance_0.10.4             modelbased_0.8.6              
[59] insight_0.19.2                 effectsize_0.8.3              
[61] datawizard_0.8.0               correlation_0.8.4             
[63] bayestestR_0.13.1              easystats_0.6.0               
[65] lubridate_1.9.2                forcats_1.0.0                 
[67] stringr_1.5.0                  dplyr_1.1.2                   
[69] purrr_1.0.1                    readr_2.1.4                   
[71] tidyr_1.3.0                    tibble_3.2.1                  
[73] ggplot2_3.4.2                  tidyverse_2.0.0               

loaded via a namespace (and not attached):
  [1] estimability_1.4.1            rappdirs_0.3.3               
  [3] coda_0.19-4                   bit64_4.0.5                  
  [5] knitr_1.42                    irlba_2.3.5.1                
  [7] multcomp_1.4-24               DelayedArray_0.24.0          
  [9] KEGGREST_1.38.0               RCurl_1.98-1.12              
 [11] generics_0.1.3                ScaledMatrix_1.6.0           
 [13] cowplot_1.1.1                 TH.data_1.1-2                
 [15] RSQLite_2.3.1                 bit_4.0.5                    
 [17] tzdb_0.4.0                    base64url_1.4                
 [19] webshot_0.5.4                 xml2_1.3.4                   
 [21] httpuv_1.6.11                 DirichletMultinomial_1.40.0  
 [23] xfun_0.39                     jquerylib_0.1.4              
 [25] hms_1.1.3                     TSP_1.2-4                    
 [27] evaluate_0.21                 promises_1.2.0.1             
 [29] fansi_1.0.4                   dbplyr_2.3.2                 
 [31] igraph_1.5.0                  DBI_1.1.3                    
 [33] htmlwidgets_1.6.2             ellipsis_0.3.2               
 [35] crosstalk_1.2.0               backports_1.4.1              
 [37] sparseMatrixStats_1.10.0      vctrs_0.6.3                  
 [39] cachem_1.0.8                  withr_2.5.0                  
 [41] vroom_1.6.3                   checkmate_2.2.0              
 [43] emmeans_1.8.6                 treeio_1.22.0                
 [45] prettyunits_1.1.1             svglite_2.1.1                
 [47] cluster_2.1.4                 ExperimentHub_2.6.0          
 [49] ape_5.7-1                     lazyeval_0.2.2               
 [51] crayon_1.5.2                  labeling_0.4.2               
 [53] pkgconfig_2.0.3               nlme_3.1-162                 
 [55] vipor_0.4.5                   rlang_1.1.1                  
 [57] globals_0.16.2                lifecycle_1.0.3              
 [59] sandwich_3.0-2                registry_0.5-1               
 [61] filelock_1.0.2                BiocFileCache_2.6.1          
 [63] rsvd_1.0.5                    AnnotationHub_3.6.0          
 [65] Matrix_1.5-4.1                Rhdf5lib_1.20.0              
 [67] zoo_1.8-12                    beeswarm_0.4.0               
 [69] png_0.1-8                     ca_0.71.1                    
 [71] bitops_1.0-7                  rhdf5filters_1.10.1          
 [73] blob_1.2.4                    DelayedMatrixStats_1.20.0    
 [75] decontam_1.18.0               brew_1.0-8                   
 [77] gridGraphics_0.5-1            DECIPHER_2.26.0              
 [79] beachmat_2.14.2               memoise_2.0.1                
 [81] plyr_1.8.8                    zlibbioc_1.44.0              
 [83] compiler_4.2.0                snakecase_0.11.0             
 [85] cli_3.6.1                     ade4_1.7-22                  
 [87] MASS_7.3-60                   mgcv_1.8-42                  
 [89] tidyselect_1.2.0              stringi_1.7.12               
 [91] yaml_2.3.7                    BiocSingular_1.14.0          
 [93] ggrepel_0.9.3                 grid_4.2.0                   
 [95] sass_0.4.6                    tools_4.2.0                  
 [97] timechange_0.2.0              parallel_4.2.0               
 [99] rstudioapi_0.14               foreach_1.5.2                
[101] gridExtra_2.3                 farver_2.1.1                 
[103] Rtsne_0.16                    digest_0.6.31                
[105] BiocManager_1.30.21           FNN_1.1.3.2                  
[107] shiny_1.7.4                   Rcpp_1.0.10                  
[109] BiocVersion_3.16.0            later_1.3.1                  
[111] httr_1.4.6                    AnnotationDbi_1.60.2         
[113] colorspace_2.1-0              rvest_1.0.3                  
[115] splines_4.2.0                 uwot_0.1.14                  
[117] yulab.utils_0.0.6             tidytree_0.4.2               
[119] multtest_2.54.0               ggplotify_0.1.0              
[121] systemfonts_1.0.4             xtable_1.8-4                 
[123] jsonlite_1.8.5                ggfun_0.0.9                  
[125] R6_2.5.1                      pillar_1.9.0                 
[127] htmltools_0.5.5               mime_0.12                    
[129] fastmap_1.1.1                 BiocParallel_1.32.6          
[131] BiocNeighbors_1.16.0          interactiveDisplayBase_1.36.0
[133] codetools_0.2-19              mvtnorm_1.2-2                
[135] utf8_1.2.3                    bslib_0.5.0                  
[137] curl_5.0.1                    ggbeeswarm_0.7.2             
[139] survival_3.5-5                rmarkdown_2.22               
[141] biomformat_1.26.0             munsell_0.5.0                
[143] rhdf5_2.42.1                  GenomeInfoDbData_1.2.9       
[145] iterators_1.0.14              reshape2_1.4.4               
[147] gtable_0.3.3